We first want to load the datasets for businesses, users and reviews.
#load businesses
lines <- readLines("yelp_academic_dataset_business.json")
business_full <- lapply(lines, fromJSON)
rm(lines)
business_full <- data.frame(do.call('rbind', business_full))
business_full <- business_full %>%
mutate(state = as.character(state), business_id = as.character(business_id))
The naming of the city in the dataset is at the postal code level. We want to make the grouping at the metropolitan level so that we can classify tourists vs. locals. We use the grouping that yelp provided.
unique(business_full$state)
#some states seem to be misclassified
View(filter(business_full, state == "NM")[1:10,]) # Las Vegas in NM, remove
View(filter(business_full, state == "TX")[1:10,]) # Dallas in Texas, remove
View(filter(business_full, state == "EDH")[1:10,]) # EDH is Edinburgh
View(filter(business_full, state == "MLN")[1:10,]) # MLN is Edinburgh
View(filter(business_full, state == "FIF")[1:10,]) # close to Edinburgh, classify as Edinburgh
View(filter(business_full, state == "ELN")[1:10,]) # close to Edinburgh, classify as Edinburgh
View(filter(business_full, state == "BW")[1:10,]) # Karlsruhe
View(filter(business_full, state == "RP")[1:10,]) # Karlsruhe
View(filter(business_full, state == "KHL")[1:10,]) # Edinburgh
View(filter(business_full, state == "NW")[1:10,]) # Karlsruhe
#group businesses by cities listed in yelp dataset
yelp_city <- matrix(c("PA", "Pittsburgh",
"NC", "Charlotte",
"SC", "Charlotte",
"WI", "Madison",
"IL", "Urbana-Champaign",
"AZ", "Phoenix",
"NV", "Las Vegas",
"QC", "Montreal",
"ON", "Waterloo",
"EDH", "Edinburgh" ,
"MLN", "Edinburgh",
"ELN", "Edinburgh",
"BW", "Karlsruhe",
"RP", "Karlsruhe"),
ncol = 2, byrow = TRUE)
colnames(yelp_city) <- c("state", "yelp_city")
yelp_city <- data.frame(yelp_city)
#join with business
business_full <- left_join(business_full, yelp_city, by = "state")
business_full <- business_full %>%
filter(!is.na(yelp_city)) %>%
mutate(state = as.character(state))
#some businesses are in more than one category
#unpack the categories for each business
business_categories <- cbind(unlist(rep(business_full$business_id,lapply(business_full$categories,length))),
unlist(business_full$categories))
colnames(business_categories) <- c("business_id", "category")
business_categories <- data.frame(business_categories)
#spread to get one row per business with columns for each category
#this series of code will turn the dataset into wide format with categories as a logical variable
business_categories_wide <- data.frame(business_categories, 1)
business_categories_wide <- business_categories_wide %>% spread(category, X1)
business_categories_wide[is.na(business_categories_wide)] <- 0
#write.csv(business_categories, file = "business_categories.csv")
#get the categories as a list
cats <- data.frame(unlist(business_full$categories))
colnames(cats) <- c("categories")
#count how many times each category appears
#arrange by count
cats <- cats %>%
group_by(categories) %>%
summarise(n = n()) %>%
ungroup %>%
arrange(desc(n))
#pick the first 100 categories
high_cats <- cats$categories[1:100]
high_cats <- as.character(high_cats)
#join with business data frame
business_wide <- select(business_full, business_id, yelp_city) %>%
left_join(select(business_categories_wide, business_id, one_of(high_cats)))
We first convert the users dataset into a .csv file. This allows the file to load faster for subsequent uses.
lines <- readLines("yelp_academic_dataset_user.json")
users_full <- lapply(lines, fromJSON)
rm(lines)
users_full <- data.frame(do.call('rbind', users_full))
users_full <- apply(users_full, 2, as.character)
users_full <- data.frame(users_full)
write.csv(users_full, file = "users_full.csv")
#load users
users <- read_csv("users_full.csv")
We had to set up the reviews dataset to convert to .csv in several steps because it was too large.
chunky <- function(lines,file_name) {
jsoned <- lapply(lines, fromJSON)
df <- data.frame(do.call('rbind',jsoned))
df <- df[,-c(1,7)]
df$user_id <- as.character(df$user_id)
df$review_id <- as.character(df$review_id)
df$stars <- as.numeric(df$stars)
df$date <- as.character(df$date)
df$text <- as.character(df$text)
df$business_id <- as.character(df$business_id)
write_csv(df,file_name)
return("done!")
}
chunk_wrapper <- function(lines) {
n <- length(lines)
for(i in 1:ceiling(n/250000)) {
start_row <- 1+(i-1)*250000
end_row <- i*250000
if(i==ceiling(n/250000)) { end_row = n}
chunky(lines[start_row:end_row],paste0("reviews_",start_row,"_",end_row,".csv"))
}
return("really done!")
}
#combine the files to get reviews.csv
reviews_full <- read_csv("reviews_full.csv")
write.csv(reviews, file = "reviews_notext.csv")
#load reviews dataset
reviews <- read_csv("reviews_notext.csv")[-1]
Our dataset does not have the information regarding the home city of each user. In this section, we attempt to make a classification of tourist vs. local for each user-city combination using some of the information we have.
To this end, we create some variable for each user such as number of places reviewed in each city (locals have more reviews in their home city), standard deviation (locals will have reviewed places in their home city over a longer period of time) and the number of reviews in each category for each user (some categories like “Home Services” might be reviewed more by locals).
#join reviews and businesses
joined <- left_join(select(reviews, user_id, business_id, date), business_wide, by = "business_id")
#add number of categories reviewed in each city by each user
sum_categories <- select(joined, -business_id, -date) %>%
group_by(user_id, yelp_city) %>%
summarize_each(funs(sum)) %>%
ungroup
#get the number of places reviewed in each city and the standard deviation of the dates
joined <- select(joined, user_id, yelp_city, date) %>%
left_join(select(users, user_id, review_count)) %>%
group_by(user_id, yelp_city) %>%
summarize(review_city = n(), sd_dates = sd(as.Date(date)))
## Joining by: "user_id"
#join to the table with summed categories
joined <- left_join(joined, sum_categories)
## Joining by: c("user_id", "yelp_city")
#join to users table
joined <- joined %>%
left_join(select(users, user_id, review_count)) %>%
ungroup
## Joining by: "user_id"
#reorder columns and rename review_count for users
joined <- joined[,c(1:3, 105, 4:104)]
colnames(joined)[4] <- "review_total"
#set SD dates that are NA's to 0
joined$sd_dates[is.na(joined$sd_dates)] <- 0
#remove users that have 10 or less reviews
grouped_users <- joined %>% filter(review_total > 10)
#check if there are still users that have only reviewed in one city
grouped_users %>% ungroup %>%summarise(n = sum(review_city == review_total))
## Source: local data frame [1 x 1]
##
## n
## (int)
## 1 2107
#remove users that have only reviewed in one city
#because they will have 100% of their reviews one city but this doesn't help with classifying them
grouped_users <- grouped_users %>%
ungroup %>%
filter(review_city != review_total)
#create columns that convert number of reviews into proportions
grouped_users <- grouped_users %>%
mutate(percent_city = review_city/review_total) %>% na.omit
Now, we will assign some of the users that we can be fairly certain are tourist or locals as such and build a regression model to eventually apply the model to the rest of the dataset.
To assign tourist/local designation to users, we use a conservative threshold for what percent of their total reviews would need to be in a particular city for them to be considered a tourist or a local in that city. We set 0.1 as the threshold for being a tourist i.e. users would need to have done 10% or less of their reviews in a city to be considered tourists in that city. And we set 0.8 as the threshold for being a local i.e. users would need to have done 80% or more of their reviews in a city to be considered locals in that city.
#create tourist vs local column
grouped_users <- grouped_users %>%
mutate(tourist = ifelse(percent_city < 0.1, 1, ifelse(percent_city > 0.8, 0, NA)))
#keep only the ones that have been classified as tourists or locals
grouped_users_tl <- grouped_users %>% filter(!is.na(tourist))
grouped_users_tl %>% group_by(tourist) %>% summarise(num = n(), review_count = mean(review_total),
city_count = mean(review_city))
## Source: local data frame [2 x 4]
##
## tourist num review_count city_count
## (dbl) (int) (dbl) (dbl)
## 1 0 11540 31.96187 28.798527
## 2 1 145146 109.24637 2.524127
We fit a logistic regression model with and without a LASSO penalty. We want to check if the LASSO penalty would prevent the model from overfitting to the training set by regularizing the contributions of some of the parameters.
set.seed(202)
#create train and test dataset
inTrain <- createDataPartition(y = grouped_users_tl$tourist, p=0.5)
train_set_tl <- slice(select(grouped_users_tl, -user_id, -yelp_city, -review_city, -review_total, -percent_city),
inTrain$Resample1)
test_set_tl <- slice(select(grouped_users_tl, -user_id, -yelp_city, -review_city, -review_total, -percent_city),
-inTrain$Resample1)
#run glm model
glm.tourist <- glm(tourist ~ ., family = binomial, data = train_set_tl)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.tourist)
##
## Call:
## glm(formula = tourist ~ ., family = binomial, data = train_set_tl)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 0.0431 0.0805 0.1035 8.4904
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.9961532 0.0635861 94.300 < 2e-16 ***
## sd_dates -0.0005344 0.0001606 -3.328 0.000874 ***
## Restaurants -0.3559045 0.0273510 -13.012 < 2e-16 ***
## Shopping -0.4466946 0.0704644 -6.339 2.31e-10 ***
## Food -0.0764235 0.0576321 -1.326 0.184820
## `Beauty & Spas` -1.3396638 0.1477410 -9.068 < 2e-16 ***
## `Health & Medical` -1.4586177 0.1787024 -8.162 3.29e-16 ***
## Nightlife 0.1283229 0.0865783 1.482 0.138298
## `Home Services` -1.7926465 0.1469037 -12.203 < 2e-16 ***
## Bars -0.1505227 0.0974534 -1.545 0.122453
## Automotive -1.8088780 0.1574875 -11.486 < 2e-16 ***
## `Local Services` -0.9933200 0.1370997 -7.245 4.32e-13 ***
## `Active Life` -0.1939946 0.0736374 -2.634 0.008427 **
## Fashion 0.0286625 0.1847261 0.155 0.876694
## `Event Planning & Services` -0.5447381 0.1343232 -4.055 5.00e-05 ***
## `Fast Food` -0.2417276 0.0723695 -3.340 0.000837 ***
## Pizza -0.4444443 0.0541094 -8.214 < 2e-16 ***
## Mexican -0.4854461 0.0458183 -10.595 < 2e-16 ***
## `Hotels & Travel` 1.7877953 0.1395213 12.814 < 2e-16 ***
## `American (Traditional)` 0.2273066 0.0450313 5.048 4.47e-07 ***
## Sandwiches 0.1624486 0.0583723 2.783 0.005386 **
## `Arts & Entertainment` -0.0277359 0.0583915 -0.475 0.634788
## `Coffee & Tea` -0.3578845 0.0695350 -5.147 2.65e-07 ***
## `Hair Salons` -0.1625889 0.1569397 -1.036 0.300204
## Italian -0.0809937 0.0568670 -1.424 0.154370
## Burgers -0.2321842 0.0562344 -4.129 3.65e-05 ***
## `Auto Repair` -0.3034827 0.2232134 -1.360 0.173954
## Doctors -0.5740310 0.2906361 -1.975 0.048259 *
## `Nail Salons` -0.5166313 0.1664683 -3.103 0.001913 **
## Chinese -0.2399499 0.0569412 -4.214 2.51e-05 ***
## `American (New)` 0.0406571 0.0396477 1.025 0.305146
## `Home & Garden` -0.2086535 0.2085807 -1.000 0.317142
## Pets -1.4473477 0.3629759 -3.987 6.68e-05 ***
## `Fitness & Instruction` -1.1791621 0.2267109 -5.201 1.98e-07 ***
## Hotels 0.0701798 0.2059056 0.341 0.733228
## Grocery -0.5059714 0.1116880 -4.530 5.89e-06 ***
## `Real Estate` 0.5764424 0.4740113 1.216 0.223949
## `Breakfast & Brunch` -0.0321472 0.0450359 -0.714 0.475343
## Dentists 0.0695466 0.5317114 0.131 0.895935
## `Specialty Food` 0.0915576 0.1090006 0.840 0.400923
## `Women's Clothing` -0.0410449 0.2423902 -0.169 0.865534
## Bakeries -0.0647634 0.0837086 -0.774 0.439122
## `Professional Services` -0.7543446 0.2846755 -2.650 0.008053 **
## `Ice Cream & Frozen Yogurt` -0.3039911 0.1028826 -2.955 0.003129 **
## Cafes -0.0865165 0.0789680 -1.096 0.273258
## `Financial Services` -2.2000231 0.5215550 -4.218 2.46e-05 ***
## Pubs 0.0089577 0.0777705 0.115 0.908302
## `Pet Services` -1.0749644 0.4890239 -2.198 0.027936 *
## Japanese 0.0499204 0.0667985 0.747 0.454866
## `General Dentistry` -0.3002220 0.6420350 -0.468 0.640064
## `Sports Bars` -0.5210206 0.0928551 -5.611 2.01e-08 ***
## `Sushi Bars` -0.4623925 0.0690043 -6.701 2.07e-11 ***
## Apartments -0.9298809 0.5508328 -1.688 0.091385 .
## Education -0.1402660 0.2952759 -0.475 0.634763
## `Convenience Stores` -0.1537277 0.2431506 -0.632 0.527235
## Gyms 0.1344937 0.3031612 0.444 0.657305
## `Sporting Goods` -0.0669325 0.1965260 -0.341 0.733421
## `Skin Care` -0.3530776 0.2272197 -1.554 0.120207
## `Cosmetics & Beauty Supply` 1.0494256 0.2653154 3.955 7.64e-05 ***
## Desserts 0.5921127 0.0864382 6.850 7.38e-12 ***
## `Chicken Wings` -0.4010382 0.1179227 -3.401 0.000672 ***
## Delis -0.0589924 0.1035867 -0.569 0.569018
## `Day Spas` 0.8371774 0.1647131 5.083 3.72e-07 ***
## `Hair Removal` 0.0703769 0.2031577 0.346 0.729031
## Seafood -0.0642442 0.0678862 -0.946 0.343970
## Drugstores 0.2839653 0.2507534 1.132 0.257446
## `Men's Clothing` 0.5906090 0.2877055 2.053 0.040090 *
## Massage 0.9645520 0.2165619 4.454 8.43e-06 ***
## Tires -0.3877702 0.3017994 -1.285 0.198841
## Accessories 0.4028619 0.2991755 1.347 0.178118
## `Flowers & Gifts` -0.0667495 0.2526263 -0.264 0.791609
## Lounges 0.0033027 0.0879820 0.038 0.970056
## Steakhouses 0.1541856 0.0638148 2.416 0.015686 *
## Jewelry -0.5788581 0.2351399 -2.462 0.013826 *
## `Books, Mags, Music & Video` 0.6927312 0.1462881 4.735 2.19e-06 ***
## `Oil Change Stations` -0.2475667 0.3008700 -0.823 0.410601
## `Arts & Crafts` 0.5610681 0.2686067 2.089 0.036725 *
## `Department Stores` 1.0071455 0.2299165 4.380 1.18e-05 ***
## Mediterranean -0.2991872 0.0870239 -3.438 0.000586 ***
## `Dry Cleaning & Laundry` -0.4472951 0.3625446 -1.234 0.217290
## Barbeque -0.3156261 0.0795624 -3.967 7.28e-05 ***
## `Gas & Service Stations` 2.4907501 0.3512135 7.092 1.32e-12 ***
## Barbers -0.6382606 0.2854472 -2.236 0.025352 *
## Trainers 0.1986394 0.3304376 0.601 0.547746
## `Asian Fusion` 0.1721267 0.0832548 2.067 0.038690 *
## `Banks & Credit Unions` 1.4511201 0.6336881 2.290 0.022024 *
## `Public Services & Government` 0.6302393 0.2481155 2.540 0.011082 *
## Thai -0.1409520 0.0773182 -1.823 0.068301 .
## `Beer, Wine & Spirits` 0.6774906 0.1526553 4.438 9.08e-06 ***
## `Furniture Stores` 0.4865864 0.3202224 1.520 0.128630
## `Pet Groomers` 1.2077524 0.4582890 2.635 0.008405 **
## Veterinarians -0.5742026 0.4085322 -1.406 0.159865
## `Auto Parts & Supplies` 0.6821131 0.3307159 2.063 0.039157 *
## `Venues & Event Spaces` 1.0030304 0.1970093 5.091 3.56e-07 ***
## `Cosmetic Dentists` -0.5274160 0.5938378 -0.888 0.374461
## French 0.4656792 0.0953207 4.885 1.03e-06 ***
## `Performing Arts` -0.0650986 0.1071084 -0.608 0.543332
## Optometrists -0.0753107 0.4370995 -0.172 0.863205
## `Shoe Stores` 0.3869707 0.3394732 1.140 0.254321
## `Juice Bars & Smoothies` -1.0022978 0.1310382 -7.649 2.03e-14 ***
## Indian -0.4971457 0.1151667 -4.317 1.58e-05 ***
## Buffets 0.6656441 0.0740122 8.994 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41119.3 on 78342 degrees of freedom
## Residual deviance: 6957.9 on 78241 degrees of freedom
## AIC: 7161.9
##
## Number of Fisher Scoring iterations: 9
#test accuracy on train and test set
glm_train_prob <- round(predict(glm.tourist, train_set_tl, type = "response"))
glm_test_prob <- round(predict(glm.tourist, test_set_tl, type = "response"))
#Confusion Matrix for train set
tab <- table(glm_train_prob, train_set_tl$tourist)
conf_matrix <- confusionMatrix(tab)
conf_matrix$table
##
## glm_train_prob 0 1
## 0 5041 350
## 1 712 72240
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9864442
#Confusion Matrix for test set
tab <- table(glm_test_prob, test_set_tl$tourist)
conf_matrix <- confusionMatrix(tab)
conf_matrix$table
##
## glm_test_prob 0 1
## 0 5106 333
## 1 681 72223
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9870569
#glmnet needs covariates in a matrix form
x_train <- as.matrix(train_set_tl %>% select(-tourist))
x_test <-as.matrix(test_set_tl %>% select(-tourist))
#run LASSO without tuning
tourist.lasso <-glmnet(x_train, y = as.factor(train_set_tl$tourist),alpha=1,family='binomial')
plot(tourist.lasso,xvar="lambda")
title(main = "Coefficients at different Log Lambda", cex.main = 1, line = 3)
#use cross validation to find optimal value of lambda (penalty parameter)
cv.lasso <- cv.glmnet(x_train, y=train_set_tl$tourist, alpha=1)
plot(cv.lasso)
title( main = "CV MSE vs Log Lambda", cex.main = 1, line = 3)
best_lambda_l <- cv.lasso$lambda.min
#run LASSO with the best_lambda
tourist.lasso_best<- glmnet(x_train, as.factor(train_set_tl$tourist),
alpha=1,family='binomial', lambda = best_lambda_l)
tourist.lasso_best$beta
## 101 x 1 sparse Matrix of class "dgCMatrix"
## s0
## sd_dates -0.0006501903
## Restaurants -0.2632622002
## Shopping -0.0694594072
## Food -0.0360946505
## Beauty & Spas -0.6903812798
## Health & Medical -1.3312458319
## Nightlife .
## Home Services -1.5028226374
## Bars .
## Automotive -1.2656647039
## Local Services -1.0471013875
## Active Life -0.1346487800
## Fashion .
## Event Planning & Services .
## Fast Food -0.1396992714
## Pizza -0.4235741575
## Mexican -0.4817459367
## Hotels & Travel 1.1314021621
## American (Traditional) .
## Sandwiches .
## Arts & Entertainment .
## Coffee & Tea -0.3294576670
## Hair Salons -0.3775843728
## Italian -0.0512481322
## Burgers -0.1794248379
## Auto Repair -0.5040639276
## Doctors -0.1998561637
## Nail Salons -0.7240229431
## Chinese -0.1880691645
## American (New) .
## Home & Garden -0.1503610862
## Pets -1.3040733898
## Fitness & Instruction -0.8157408934
## Hotels .
## Grocery -0.2725365212
## Real Estate .
## Breakfast & Brunch .
## Dentists .
## Specialty Food .
## Women's Clothing .
## Bakeries -0.0272475788
## Professional Services -0.3369412284
## Ice Cream & Frozen Yogurt -0.1768340910
## Cafes -0.0712638030
## Financial Services -0.6723577007
## Pubs .
## Pet Services -0.1169474396
## Japanese .
## General Dentistry -0.0908984387
## Sports Bars -0.3460003215
## Sushi Bars -0.3667584284
## Apartments .
## Education .
## Convenience Stores .
## Gyms .
## Sporting Goods -0.1572037469
## Skin Care .
## Cosmetics & Beauty Supply .
## Desserts 0.2652994267
## Chicken Wings -0.2996425654
## Delis .
## Day Spas 0.2437537259
## Hair Removal -0.1562492100
## Seafood .
## Drugstores .
## Men's Clothing .
## Massage 0.0048033395
## Tires -0.0577359388
## Accessories .
## Flowers & Gifts -0.0484954929
## Lounges .
## Steakhouses 0.0108847204
## Jewelry -0.1654896365
## Books, Mags, Music & Video .
## Oil Change Stations -0.1086798316
## Arts & Crafts .
## Department Stores .
## Mediterranean -0.2728774954
## Dry Cleaning & Laundry .
## Barbeque -0.2553735746
## Gas & Service Stations 0.7927575380
## Barbers -0.6419860465
## Trainers .
## Asian Fusion .
## Banks & Credit Unions .
## Public Services & Government .
## Thai -0.1893477768
## Beer, Wine & Spirits 0.0798811809
## Furniture Stores .
## Pet Groomers .
## Veterinarians -0.1981414369
## Auto Parts & Supplies .
## Venues & Event Spaces 0.2159680224
## Cosmetic Dentists -0.0751386487
## French 0.2752264223
## Performing Arts .
## Optometrists .
## Shoe Stores .
## Juice Bars & Smoothies -0.7789313205
## Indian -0.4201318893
## Buffets 0.4424507694
#test accuracy on train and test set
lasso_train_prob <- round(predict(tourist.lasso_best, x_train, type = "response"))
lasso_test_prob <- round(predict(tourist.lasso_best, x_test, type = "response"))
#Confusion Matrix for train set
tab <- table(lasso_train_prob, train_set_tl$tourist)
conf_matrix <- confusionMatrix(tab)
conf_matrix$table
##
## lasso_train_prob 0 1
## 0 4923 327
## 1 830 72263
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9852316
#Confusion Matrix for test set
tab <- table(lasso_test_prob, test_set_tl$tourist)
conf_matrix <- confusionMatrix(tab)
conf_matrix$table
##
## lasso_test_prob 0 1
## 0 4955 332
## 1 832 72224
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9851423
The accuracies are similar for the two methods and the LASSO shrinks most of the coefficients that were non-significant from the GLM. We decided to got with the GLM model without the LASSO penalty based on the slightly higher accuracy in the test dataset.
To make prediction on the full dataset, we use conservative thresholds on the results of the regression as well. We decided to only classify as tourists if the predicted probability of being a tourist is greater than 0.6 and classify as locals if the predicted probability is less than 0.4.
#apply logistic regression model to the full dataset
grouped_users_pred <- grouped_users %>%
mutate(pred_tourist_prob = predict(glm.tourist, grouped_users, type = "response"))
#classify as tourist, local or unsure (if between 0.4 and 0.6)
grouped_users_pred <- grouped_users_pred %>%
mutate(pred_tourist = ifelse(pred_tourist_prob > 0.6, "tourist",
ifelse(pred_tourist_prob <0.4, "local", "unsure")))
#"Confusion Matrix"
grouped_users_pred %>%
filter(!is.na(pred_tourist)) %>%
mutate(tourist = ifelse(tourist == 1, "tourist",
ifelse(tourist == 0, "local", NA))) %>%
filter(!is.na(tourist)) %>%
group_by(tourist) %>%
summarise(Correct = sum(tourist == pred_tourist),
Incorrect= sum(tourist != pred_tourist))
## Source: local data frame [2 x 3]
##
## tourist Correct Incorrect
## (chr) (int) (int)
## 1 local 9873 1667
## 2 tourist 144342 804
#check percent of reviews in the city and the total number of reviews for users in each category
grouped_users_pred %>% group_by(pred_tourist) %>% summarise(percent_city = mean(percent_city),
review_total = mean(review_total))
## Source: local data frame [3 x 3]
##
## pred_tourist percent_city review_total
## (chr) (dbl) (dbl)
## 1 local 0.67934736 61.82995
## 2 tourist 0.09171367 88.67800
## 3 unsure 0.52425656 46.90095
This model is based on a forced initial classification (based on <0.1, >0.8 thresholds) and while it can be justified and our thresholds were pretty conservative, there might be several issues because we do not have a way of verifying our thresholds.
For instance, our initial classification might be affected by the number of total reviews for each user. We see here that most people classified as tourists are the ones that generally have more total reviews. Some of them might be frequent travelers and they might review in their home cities just as often as their travel cities.
Also, from our confusion matrices, we can see that our models are skewed by the tourists since we have a lot more tourists than locals in our dataset (tourists can be from any part of the world, locals need to be from one of the 10 cities in the dataset). So although the model seems to have a high accuracy overall, the error is much larger among the locals.
We ran some spot checks to see how well the model did by reading some of the actual reviews in Montreal and Las Vegas.
#montreal locals and tourists
montreal_local <- grouped_users_pred %>% filter(yelp_city == "Montreal" & pred_tourist == "local")
montreal_tourist <- grouped_users_pred %>% filter(yelp_city == "Montreal" & pred_tourist == "tourist")
vegas_local <- grouped_users_pred %>% filter(yelp_city == "Las Vegas" & pred_tourist == "local")
vegas_tourist <- grouped_users_pred %>% filter(yelp_city == "Las Vegas" & pred_tourist == "tourist")
#MONTREAL LOCALS
smpl <- sample(seq(1, nrow(montreal_local), 1), 10)
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[1]],],select(business,yelp_city,
business_id))
#can't tell, in French
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[2]],],select(business,yelp_city,
business_id))
#might be local
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[3]],],select(business,yelp_city,
business_id))
#probably local
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[4]],],select(business,yelp_city,
business_id))
#probably local
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[5]],],select(business,yelp_city,
business_id))
#local
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[6]],],select(business,yelp_city,
business_id))
#French
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_local$user_id[smpl[7]],],select(business,yelp_city,
business_id))
#French, probably local
#MONTEREAL TOURISTS
smpl <- sample(seq(1, nrow(montreal_tourist), 1), 10)
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_tourist$user_id[smpl[1]],],select(business,yelp_city,
business_id))
#incorrect
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_tourist$user_id[smpl[2]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_tourist$user_id[smpl[3]],],select(business,yelp_city,
business_id))
#probably correct
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_tourist$user_id[smpl[4]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_tourist$user_id[smpl[5]],],select(business,yelp_city,
business_id))
#seems correct, local in Champagne, frequently visits Montreal userID CX4GcrCCnzfNyAl0cPqGbg
test_reviews <- left_join(reviews[reviews_full$user_id==montreal_tourist$user_id[smpl[6]],],select(business,yelp_city,
business_id))
#correct
#VEGAS LOCALS
smpl <- sample(seq(1, nrow(vegas_local), 1), 10)
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_local$user_id[smpl[1]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_local$user_id[smpl[2]],],select(business,yelp_city,
business_id))
#seems correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_local$user_id[smpl[3]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_local$user_id[smpl[4]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_local$user_id[smpl[5]],],select(business,yelp_city,
business_id))
#correct
#VEGAS TOURISTS
smpl <- sample(seq(1, nrow(vegas_tourist), 1), 10)
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[1]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[2]],],select(business,yelp_city,
business_id))
#seems correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[3]],],select(business,yelp_city,
business_id))
#correct
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[4]],],select(business,yelp_city,
business_id))
#correct, review in German
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[5]],],select(business,yelp_city,
business_id))
#seems incorrect
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[6]],],select(business,yelp_city,
business_id))
#seems incorrect
test_reviews <- left_join(reviews[reviews_full$user_id==vegas_tourist$user_id[smpl[7]],],select(business,yelp_city,
business_id))
#seems correct
#save users that have been classified
users_trstvsloc <- grouped_users_pred %>% select(user_id, yelp_city, pred_tourist)
#write.csv(users_trstvsloc, file = "user_touristvlocal.csv")
#long format with cities as columns
users_trstvsloc_spread <- users_trstvsloc %>% spread(yelp_city, pred_tourist)
#write.csv(users_trstvsloc_spread, file = "user_touristvlocal_spread.csv")
#select users that have been classified as local in at least one city and tourist in at least one
yc_locals_trst <- users_trstvsloc %>%
group_by(user_id) %>%
mutate(n=n()) %>%
ungroup %>%
filter(n>1) %>%
filter(pred_tourist == "local")
#some users are locals in more than 1 city
twice_locals <- yc_locals_trst %>% group_by(user_id) %>% summarise(n = n()) %>% filter(n>1)
#check how many cities they've been classified in
twice_locals <- users_trstvsloc %>%
filter(user_id %in% twice_locals$user_id) %>%
group_by(user_id) %>% mutate(n = n()) %>% unique
#people that are not classified as tourists anywhere
twice_locals_only <- unique(filter(twice_locals, n<3)$user_id)
#filter out people that are not classified as tourists anywhere
yc_locals_trst <- yc_locals_trst %>% filter(!(user_id %in% twice_locals_only))
#wide version of the same data with cities as columns
locals_tourist <- right_join(users_trstvsloc_spread, select(yc_locals_trst, user_id))
## Joining by: "user_id"
This will be the dataset we will use to build our model and make prediction because we have information on the users’ home city and travel city preferences.
We will now build our model to make predictions for yelpers.
Business effect is the average stars the business receives from the reviews in our dataset.
#filter just the dining businesses
business_food <- business_full %>%
filter(grepl("Restaurants", as.character(categories)) |
grepl("Cafe", as.character(categories)))
set_users <- unique(yc_locals_trst$user_id)
mu <- mean(reviews %>% filter(!(user_id %in% set_users)) %>% .$stars)
business2 <- reviews %>% filter(!(user_id %in% set_users)) %>% group_by(business_id) %>%
summarise(business_stars = mean(stars),b_n=n()) %>%
left_join(select(business_food, business_id, yelp_city, attributes),
select(.,business_id,business_stars,b_n),by="business_id")
Tourist effect is the average of the stars for reviews by a tourist for each business.
business2 <- reviews %>%
filter(!(user_id %in% set_users)) %>%
left_join(select(business2,business_id,yelp_city))%>%
left_join(users_trstvsloc,by=c("user_id","yelp_city"))%>%
filter(pred_tourist=="tourist") %>% group_by(business_id) %>%
summarise(tourist_stars = mean(stars),t_n=n()) %>%
left_join(business2,select(.,business_id,tourist_i, t_n),by="business_id")
#if there weren't any tourists who reviewed the place, set the tourist stars equal to the mean for the business
business2$tourist_stars[is.na(business2$tourist_stars)] <- business2$business_stars[is.na(business2$tourist_stars)]
This is the average ratings for users in their home city.
user_effects <- select(reviews,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
left_join(select(business2,business_id,yelp_city)) %>%
left_join(users_trstvsloc,by=c("user_id","yelp_city")) %>%
filter(pred_tourist=="local") %>%
group_by(user_id) %>%
summarise(user_local_stars = mean(stars),user_local_n=n())
This is the average ratings for users by business category in their home city.
#The user category effect takes the average ratings for each category attribute from the user's home city and connects these averages to the category attributes of the business in the review that is being predicted.
user_categories <- select(reviews,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
left_join(select(business2,business_id,yelp_city)) %>%
left_join(users_trstvsloc,by=c("user_id","yelp_city")) %>%
filter(pred_tourist=="local") %>%
inner_join(business_categories,.) %>%
filter(!(category %in% c("Restaurants","Cafe","Food"))) %>%
group_by(user_id,category) %>%
summarise(user_category_stars = mean(stars),user_cat_n=n())
This is the average ratings for users by business attributes in their home city.
#make function to make table where each attribute for a business gets a new row
make_attributes <- function(df) {
return_df <- NULL
for(i in 1:nrow(df)) {
atr_i <- unlist(df$attributes[i])
if(!is.null(atr_i)) {return_df <- rbind(return_df,cbind(df$business_id[i],names(atr_i),matrix(atr_i)))}
}
return_df <- data.frame(return_df)
names(return_df) <- c("business_id","attribute","value")
return(return_df)
}
#make each attribute for a business get a new row for the businesses reviewed by the users that have been selected
business_att <- select(reviews, -text) %>%
filter(user_id %in% set_users) %>%
left_join(select(business2, business_id, attributes)) %>%
select(business_id, attributes) %>%
make_attributes
write.csv(business_att, file = "business_att.csv")
business_attributes <- read.csv("business_att.csv")
business_attributes <- business_attributes %>% filter(value != FALSE)
#compute the average for each business attribute by user
user_attributes <- select(reviews,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
left_join(select(business2,business_id,yelp_city)) %>%
left_join(users_trstvsloc,by=c("user_id","yelp_city")) %>%
filter(pred_tourist=="local") %>%
inner_join(business_attributes,.) %>%
group_by(user_id,attribute,value) %>%
summarise(user_attribute_stars = mean(stars),user_at_n=n())
#The user ambience effect takes the average ratings for each ambience attribute from the user's home city and connects these averages to the ambience attributes of the business in the review that is being predicted.
business_ambience <- select(business_attributes,-value) %>% filter(grepl("Ambience",attribute))
user_ambience <- select(user_attributes,-value) %>% filter(grepl("Ambience",attribute))
ambience_reviews <- reviews %>%
select(.,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
inner_join(business_ambience,.,by="business_id") %>%
left_join(.,user_ambience, by=c("user_id","attribute")) %>%
mutate(user_ambience_stars = ifelse(is.na(user_attribute_stars),0,user_attribute_stars),
user_ambience_n = ifelse(is.na(user_at_n),0,user_at_n)) %>%
group_by(review_id) %>%
summarise(user_ambience_stars=mean(user_ambience_stars),user_ambience_n=sum(user_ambience_n))
#The user 'good for' effect takes the average ratings for each 'good for' attribute from the user's home city and connects these averages to the 'good for' attributes of the business in the review that is being predicted.
user_good_for <- select(user_attributes,-value) %>% filter(grepl("Good For",attribute))
business_good_for <- select(business_attributes,-value) %>% filter(grepl("Good For",attribute))
good_for_reviews <- reviews %>%
select(.,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
inner_join(business_good_for,.,by="business_id") %>%
left_join(.,user_good_for, by=c("user_id","attribute")) %>%
mutate(user_good_for_stars = ifelse(is.na(user_attribute_stars),0,user_attribute_stars),
user_good_for_n = ifelse(is.na(user_at_n),0,user_at_n)) %>%
group_by(review_id) %>%
summarise(user_good_for_stars=mean(user_good_for_stars),user_good_for_n=sum(user_good_for_n))
#The user price range effect takes the average ratings for each price range attribute from the user's home city and connects these averages to the price range attributes of the business in the review that is being predicted.
user_price <- user_attributes %>%
rename(price_range=value) %>%
filter(grepl("Price Range",attribute)) %>%
select(.,-attribute)
business_price_ranges <- business_attributes %>%
filter(grepl("Price Range",attribute)) %>%
select(business_id,value) %>%
rename(price_range=value)
price_range_reviews <- reviews %>%
select(.,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
left_join(.,business_price_ranges) %>%
left_join(.,user_price, by=c("user_id","price_range")) %>%
mutate(user_price_stars = ifelse(is.na(user_attribute_stars),0,user_attribute_stars),
user_price_n = ifelse(is.na(user_at_n),0,user_at_n)) %>%
group_by(review_id) %>%
summarise(user_price_stars=mean(user_price_stars), user_price_n=mean(user_price_n))
#First join eveything except the attribute effects
category_reviews <- reviews %>%
select(.,review_id,user_id,business_id,stars) %>%
filter(user_id %in% set_users) %>%
left_join(.,select(business2,business_id,tourist_stars,t_n,business_stars,b_n,yelp_city)) %>%
left_join(.,users_trstvsloc,by=c("user_id","yelp_city")) %>% filter(pred_tourist=="tourist") %>%
left_join(.,select(user_effects,user_id,user_local_stars,user_local_n)) %>%
inner_join(business_categories,.,by="business_id") %>%
filter(!(category %in% c("Restaurants","Cafe","Food"))) %>%
left_join(.,user_categories,by="category") %>%
mutate(user_category_stars = ifelse(is.na(user_category_stars),0,user_category_stars),
user_cat_n = ifelse(is.na(user_cat_n),0,user_cat_n)) %>%
group_by(review_id,business_id) %>%
summarise(stars=mean(stars),
user_local_stars=mean(user_local_stars),
user_local_n=mean(user_local_n),
tourist_stars=mean(tourist_stars),
t_n=mean(t_n),
business_stars=mean(business_stars),
b_n=mean(b_n),
user_category_stars=mean(user_category_stars),
user_cat_n=sum(user_cat_n))
write.csv(category_reviews, file = "category_reviews.csv")
category_reviews <- read.csv("category_reviews.csv")[-1]
#Now add attributes
review_set <- unique(category_reviews$review_id)
dat <- left_join(category_reviews,ambience_reviews %>% filter(review_id %in% review_set))
dat <- left_join(dat,good_for_reviews %>% filter(review_id %in% review_set))
dat <- left_join(dat,price_range_reviews %>% filter(review_id %in% review_set))
dat <- left_join(dat,select(business_full,business_id,yelp_city))
dat <- left_join(dat,select(reviews,review_id,user_id))
dat <- left_join(dat,select(yc_locals_trst,user_id,yelp_city), by="user_id")
dat <- rename(dat, business_city = yelp_city.x)
dat <- rename(dat, user_city = yelp_city.y)
dat <- dat %>%
mutate(business_stars = ifelse(is.na(business_stars),mu,business_stars)) %>%
mutate(user_local_stars = ifelse(is.na(user_local_stars),mu,user_local_stars)) %>%
mutate(tourist_stars = ifelse(is.na(tourist_stars),business_stars,user_local_stars)) %>%
mutate(user_category_stars = ifelse(is.na(user_category_stars),user_local_stars,user_category_stars)) %>%
mutate(user_ambience_stars = ifelse(is.na(user_ambience_stars),user_local_stars,user_ambience_stars)) %>%
mutate(user_good_for_stars = ifelse(is.na(user_good_for_stars),user_local_stars,user_good_for_stars)) %>%
mutate(user_price_stars = ifelse(is.na(user_price_stars),user_local_stars,user_price_stars)) %>%
mutate(user_local_n = ifelse(is.na(user_local_n),0,user_local_n)) %>%
mutate(t_n = ifelse(is.na(t_n),0,t_n)) %>%
mutate(b_n = ifelse(is.na(b_n),0,b_n)) %>%
mutate(user_cat_n = ifelse(is.na(user_cat_n),0,user_cat_n)) %>%
mutate(user_ambience_n = ifelse(is.na(user_ambience_n),0,user_ambience_n)) %>%
mutate(user_good_for_n = ifelse(is.na(user_good_for_n),0,user_good_for_n)) %>%
mutate(user_price_n = ifelse(is.na(user_price_n),0,user_price_n))
write_csv(dat, "final_dataset.csv")
dat <- read_csv("final_dataset.csv")
set.seed(1)
inTrain <- createDataPartition(y = dat$stars, p=0.5)
train_reviews <- slice(dat, inTrain$Resample1)
test_reviews <- slice(dat, -inTrain$Resample1)
#We model the data using multinomial logistic regression to predict the star category. We use lasso with cross-validation to chose the parameters.
x <- as.matrix(train_reviews[,4:17])
multinomial_lasso <- glmnet(x,y=as.factor(train_reviews$stars),alpha=1,family='multinomial')
plot(multinomial_lasso,xvar="lambda", main="Coefficients versus Log Lambda")
grid()
#CV to pick penalty
cv.multinomial.lasso <- cv.glmnet(x,y=train_reviews$stars,alpha=1)
plot(cv.multinomial.lasso, main="MSE vs Log Lambda")
best_multinomial_lambda <- cv.multinomial.lasso$lambda.min
#run with best lambda
lasso_multinomial_best <-glmnet(x,y=as.factor(train_reviews$stars),alpha=1,family='multinomial',
lambda=best_multinomial_lambda)
#betas
lasso_multinomial_best$beta
## $`1`
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## user_local_stars -0.419521561
## user_local_n -0.002189755
## tourist_stars -0.001861042
## t_n .
## business_stars -0.715140754
## b_n .
## user_category_stars .
## user_cat_n .
## user_ambience_stars .
## user_ambience_n .
## user_good_for_stars .
## user_good_for_n .
## user_price_stars .
## user_price_n .
##
## $`2`
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## user_local_stars -0.31196230
## user_local_n .
## tourist_stars .
## t_n .
## business_stars -0.40824287
## b_n .
## user_category_stars -0.35694578
## user_cat_n .
## user_ambience_stars .
## user_ambience_n .
## user_good_for_stars .
## user_good_for_n .
## user_price_stars -0.02990086
## user_price_n .
##
## $`3`
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## user_local_stars .
## user_local_n 1.280848e-03
## tourist_stars .
## t_n .
## business_stars .
## b_n -6.326671e-06
## user_category_stars .
## user_cat_n 1.511592e-07
## user_ambience_stars .
## user_ambience_n 1.997905e-04
## user_good_for_stars 6.746205e-03
## user_good_for_n .
## user_price_stars .
## user_price_n 1.668073e-03
##
## $`4`
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## user_local_stars .
## user_local_n 0.001101784
## tourist_stars 0.284612578
## t_n .
## business_stars 0.675353500
## b_n .
## user_category_stars .
## user_cat_n .
## user_ambience_stars 0.011842926
## user_ambience_n .
## user_good_for_stars 0.004358944
## user_good_for_n .
## user_price_stars 0.016205921
## user_price_n .
##
## $`5`
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## user_local_stars 8.733693e-01
## user_local_n .
## tourist_stars .
## t_n .
## business_stars 1.645639e+00
## b_n 5.399750e-05
## user_category_stars .
## user_cat_n -5.237706e-07
## user_ambience_stars .
## user_ambience_n .
## user_good_for_stars .
## user_good_for_n 1.623056e-04
## user_price_stars .
## user_price_n -4.129805e-03
#predictions
train_pred <- predict(lasso_multinomial_best, newx = as.matrix(train_reviews[,4:17]), type = "class")
test_pred <- predict(lasso_multinomial_best, newx = as.matrix(test_reviews[,4:17]), type = "class")
#Confusion Matrix on Train Set
mean(train_pred == train_reviews$stars)
## [1] 0.4101287
tab_train <- table(train_pred, train_reviews$stars)
conf_train <- confusionMatrix(tab_train)
conf_train
## Confusion Matrix and Statistics
##
##
## train_pred 1 2 3 4 5
## 1 3 0 0 0 0
## 2 46 41 24 18 14
## 3 18 42 48 39 9
## 4 167 330 576 951 580
## 5 42 103 210 624 933
##
## Overall Statistics
##
## Accuracy : 0.4101
## 95% CI : (0.3962, 0.4242)
## No Information Rate : 0.3387
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1344
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.0108696 0.07946 0.055944 0.5827 0.6074
## Specificity 1.0000000 0.97629 0.972727 0.4812 0.7017
## Pos Pred Value 1.0000000 0.28671 0.307692 0.3652 0.4880
## Neg Pred Value 0.9433022 0.89840 0.826255 0.6924 0.7925
## Prevalence 0.0572852 0.10710 0.178082 0.3387 0.3188
## Detection Rate 0.0006227 0.00851 0.009963 0.1974 0.1936
## Detection Prevalence 0.0006227 0.02968 0.032379 0.5405 0.3968
## Balanced Accuracy 0.5054348 0.52787 0.514336 0.5319 0.6546
#Confusion Matrix on Test Set
mean(test_pred == test_reviews$stars)
## [1] 0.407638
tab_test <- table(test_pred, test_reviews$stars)
conf_test <- confusionMatrix(tab_test)
conf_test
## Confusion Matrix and Statistics
##
##
## test_pred 1 2 3 4 5
## 1 4 0 1 0 0
## 2 50 38 28 15 8
## 3 26 42 56 29 8
## 4 180 281 594 943 597
## 5 59 95 196 645 923
##
## Overall Statistics
##
## Accuracy : 0.4076
## 95% CI : (0.3937, 0.4217)
## No Information Rate : 0.3387
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1312
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.0125392 0.083333 0.06400 0.5778 0.6009
## Specificity 0.9997777 0.976845 0.97337 0.4815 0.6968
## Pos Pred Value 0.8000000 0.273381 0.34783 0.3634 0.4812
## Neg Pred Value 0.9345523 0.910665 0.82414 0.6901 0.7886
## Prevalence 0.0662100 0.094645 0.18161 0.3387 0.3188
## Detection Rate 0.0008302 0.007887 0.01162 0.1957 0.1916
## Detection Prevalence 0.0010378 0.028850 0.03342 0.5386 0.3981
## Balanced Accuracy 0.5061585 0.530089 0.51869 0.5297 0.6489
linear_lasso <- glmnet(x,y=train_reviews$stars,alpha=1)
plot(linear_lasso,xvar="lambda", main="Coefficients versus Log Lambda")
grid()
#CV to pick lambda
cv.linear.lasso <- cv.glmnet(x,y=train_reviews$stars,alpha=1)
plot(cv.linear.lasso, main="MSE vs Log Lambda")
best_lambda <- cv.linear.lasso$lambda.min
#run with best lambda
lasso_linear_best <-glmnet(x,y=train_reviews$stars,alpha=1, lambda=best_lambda)
#betas
lasso_linear_best$beta
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## user_local_stars 4.760840e-01
## user_local_n .
## tourist_stars .
## t_n .
## business_stars 8.104072e-01
## b_n 1.705149e-05
## user_category_stars 1.207575e-01
## user_cat_n -2.466039e-07
## user_ambience_stars .
## user_ambience_n .
## user_good_for_stars .
## user_good_for_n 2.327264e-04
## user_price_stars 1.203507e-03
## user_price_n -1.095969e-03
#predictions
train_pred <- predict(lasso_linear_best, newx = as.matrix(train_reviews[,4:17]), type = "response")
test_pred <- predict(lasso_linear_best, newx = as.matrix(test_reviews[,4:17]), type = "response")
#Confusion Matrix on Train Set
mean(round(train_pred) == train_reviews$stars)
## [1] 0.3497302
tab_train <- table(round(train_pred), train_reviews$stars)
conf_train <- confusionMatrix(tab_train)
conf_train
## Confusion Matrix and Statistics
##
##
## 1 2 3 4 5
## 1 2 0 0 0 0
## 2 26 30 19 6 5
## 3 139 248 307 373 154
## 4 105 231 521 1222 1253
## 5 4 7 11 31 124
##
## Overall Statistics
##
## Accuracy : 0.3497
## 95% CI : (0.3363, 0.3634)
## No Information Rate : 0.3387
## P-Value [Acc > NIR] : 0.05528
##
## Kappa : 0.0802
## Mcnemar's Test P-Value : < 2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.0072464 0.058140 0.35781 0.7488 0.08073
## Specificity 1.0000000 0.986983 0.76919 0.3377 0.98385
## Pos Pred Value 1.0000000 0.348837 0.25143 0.3667 0.70056
## Neg Pred Value 0.9431063 0.897295 0.84682 0.7241 0.69576
## Prevalence 0.0572852 0.107098 0.17808 0.3387 0.31880
## Detection Rate 0.0004151 0.006227 0.06372 0.2536 0.02574
## Detection Prevalence 0.0004151 0.017850 0.25342 0.6916 0.03674
## Balanced Accuracy 0.5036232 0.522561 0.56350 0.5433 0.53229
#Confusion Matrix on Test Set
mean(round(test_pred) == test_reviews$stars)
## [1] 0.3659195
tab_test <- table(round(test_pred), test_reviews$stars)
conf_test <- confusionMatrix(tab_test)
conf_test
## Confusion Matrix and Statistics
##
##
## 1 2 3 4 5
## 1 1 0 0 0 0
## 2 34 29 17 6 5
## 3 157 199 353 356 150
## 4 126 225 497 1228 1229
## 5 1 3 8 42 152
##
## Overall Statistics
##
## Accuracy : 0.3659
## 95% CI : (0.3523, 0.3797)
## No Information Rate : 0.3387
## P-Value [Acc > NIR] : 3.913e-05
##
## Kappa : 0.1024
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.0031348 0.063596 0.40343 0.7525 0.09896
## Specificity 1.0000000 0.985786 0.78138 0.3481 0.98355
## Pos Pred Value 1.0000000 0.318681 0.29053 0.3716 0.73786
## Neg Pred Value 0.9339838 0.909668 0.85512 0.7330 0.69991
## Prevalence 0.0662100 0.094645 0.18161 0.3387 0.31880
## Detection Rate 0.0002076 0.006019 0.07327 0.2549 0.03155
## Detection Prevalence 0.0002076 0.018888 0.25218 0.6860 0.04276
## Balanced Accuracy 0.5015674 0.524691 0.59241 0.5503 0.54125
#Sam's KNN function
split_data <- function(data,prop) {
n <- nrow(data)
rands <- rnorm(n)
training <- rands < quantile(rands,prop)
return(list("training" = data[training,], "test" = data[training==FALSE,]))
}
train_sam <- function(it,prop,test,training,inputs,outcome,k) {
accuracy <- NULL
for(i in 1:it) {
cross_val_train = split_data(training,prop)
predictions <- knn(cross_val_train$training[,inputs],cross_val_train$test[,inputs],cross_val_train$training[[outcome]],k)
accuracy <- c(accuracy,mean(as.numeric(predictions)==as.numeric(test[[outcome]])))
}
return(accuracy)
}
knn_master <- function(it,prop,test,training,inputs,outcome,min_k,max_k){
ks <- seq(min_k,max_k,50)
accuracy <- rep(0,length(ks))
for(i in 1:length(ks)) {
accuracy_k <- train_sam(it,prop,test,training,inputs,outcome,ks[i])
accuracy[i] <- mean(accuracy_k)
}
return(cbind(ks,accuracy))
}
#CV to check best k
knn_test <- knn_master(10,.1,test_reviews,train_reviews,names(train_reviews)[4:6],"stars",1,251)
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
## Warning in as.numeric(predictions) == as.numeric(test[[outcome]]): longer
## object length is not a multiple of shorter object length
data.frame(knn_test) %>% ggplot(aes(x=ks, y= accuracy)) + geom_line() + labs(x="# K in KNN",y="Cross-validated accuracy on train")
knn_prediction <- knn(train_reviews[,names(train_reviews)[4:6]],test_reviews[,names(test_reviews)[4:6]],train_reviews[["stars"]],50)
#Accuracy
mean(round(as.numeric(knn_prediction))==as.numeric(test_reviews$stars))
## [1] 0.3675799
#remove character variables
train_reviews_rf <- train_reviews[,3:17]
#parallel processing
registerDoMC(cores=3)
#CV to pick best mtry
ctrl = trainControl(method="cv", number=10)
trf = train(factor(stars)~.,
data=train_reviews_rf,
method="rf",
metric="Accuracy",
trControl=ctrl,
allowParallel=TRUE)
print(trf)
## Random Forest
##
## 4818 samples
## 14 predictor
## 5 classes: '1', '2', '3', '4', '5'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4336, 4338, 4336, 4335, 4336, 4336, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.3985074 0.1367105 0.01321344 0.01879675
## 8 0.3883453 0.1304467 0.01740376 0.02445771
## 14 0.3939414 0.1392006 0.01484096 0.02279629
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#fit with mtry = 2
rf_fit <- randomForest(factor(stars)~., data=train_reviews_rf, ntree = 100, mtry = 2)
#predictions
f_hat_rf <- predict(rf_fit, newdata = train_reviews, type="response")
f_hat_rf2 <- predict(rf_fit, newdata = test_reviews, type="response")
#Confusion Matrix on Train Set
mean(f_hat_rf == train_reviews$stars)
## [1] 0.9956413
tab_rftrain <- table(f_hat_rf, train_reviews$stars)
conf_rftrain <- confusionMatrix(tab_rftrain)
conf_rftrain
## Confusion Matrix and Statistics
##
##
## f_hat_rf 1 2 3 4 5
## 1 275 0 0 0 0
## 2 0 515 0 0 2
## 3 0 0 853 2 1
## 4 1 1 3 1628 7
## 5 0 0 2 2 1526
##
## Overall Statistics
##
## Accuracy : 0.9956
## 95% CI : (0.9933, 0.9973)
## No Information Rate : 0.3387
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9941
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.99638 0.9981 0.9942 0.9975 0.9935
## Specificity 1.00000 0.9995 0.9992 0.9962 0.9988
## Pos Pred Value 1.00000 0.9961 0.9965 0.9927 0.9974
## Neg Pred Value 0.99978 0.9998 0.9987 0.9987 0.9970
## Prevalence 0.05729 0.1071 0.1781 0.3387 0.3188
## Detection Rate 0.05708 0.1069 0.1770 0.3379 0.3167
## Detection Prevalence 0.05708 0.1073 0.1777 0.3404 0.3176
## Balanced Accuracy 0.99819 0.9988 0.9967 0.9969 0.9961
#Confusion Matrix on Test Set
mean(f_hat_rf2 == test_reviews$stars)
## [1] 0.4134496
tab_rftest <- table(f_hat_rf2, test_reviews$stars)
conf_rftest <- confusionMatrix(tab_rftest)
conf_rftest
## Confusion Matrix and Statistics
##
##
## f_hat_rf2 1 2 3 4 5
## 1 17 16 13 8 4
## 2 44 45 55 52 19
## 3 53 65 155 142 62
## 4 126 221 450 884 560
## 5 79 109 202 546 891
##
## Overall Statistics
##
## Accuracy : 0.4134
## 95% CI : (0.3995, 0.4275)
## No Information Rate : 0.3387
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1603
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.053292 0.09868 0.17714 0.5417 0.5801
## Specificity 0.990887 0.96103 0.91834 0.5741 0.7148
## Pos Pred Value 0.293103 0.20930 0.32495 0.3945 0.4877
## Neg Pred Value 0.936555 0.91071 0.83414 0.7097 0.7844
## Prevalence 0.066210 0.09465 0.18161 0.3387 0.3188
## Detection Rate 0.003528 0.00934 0.03217 0.1835 0.1849
## Detection Prevalence 0.012038 0.04462 0.09900 0.4651 0.3792
## Balanced Accuracy 0.522089 0.52986 0.54774 0.5579 0.6474
We get similar accuracies using the multinomial logistic with the random forest and the multinomial logistic regression. We pick the random forest due to the slightly higher accuracy.
The accuracy is not too high with any of our models. As a compromise, we decided to have our response be whether we would recommend the business or not. We decided to recommend the business only if the number of stars is 5 and fit a new model with the random forest.
train_reviews_rf <- train_reviews_rf %>% mutate(recommend = ifelse(stars>4, 1,0))
train_reviews <- train_reviews %>% mutate(recommend = ifelse(stars>4, 1,0))
test_reviews <- test_reviews %>% mutate(recommend = ifelse(stars>4, 1,0))
#CV to pick mtry
ctrl2 = trainControl(method="cv", number=10)
trf2 = train(factor(recommend)~.,
data=train_reviews_rf,
method="rf",
metric="Accuracy",
trControl=ctrl2,
allowParallel=TRUE)
print(trf2)
## Random Forest
##
## 4818 samples
## 15 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4337, 4336, 4337, 4336, 4336, 4335, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 1 1 0 0
## 8 1 1 0 0
## 15 1 1 0 0
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#run Random Forest
rf_fit2 <- randomForest(factor(recommend)~., data=train_reviews_rf, ntree = 100, mtry = 2)
#predict on train and test set
f_hat_rf <- predict(rf_fit2, newdata = train_reviews, type="response")
f_hat_rf2 <- predict(rf_fit2, newdata = test_reviews, type="response")
#Confusion Matrix on Train Set
mean(f_hat_rf == train_reviews$recommend)
## [1] 1
tab_rftrain <- table(f_hat_rf, train_reviews$recommend)
conf_rftrain <- confusionMatrix(tab_rftrain)
conf_rftrain
## Confusion Matrix and Statistics
##
##
## f_hat_rf 0 1
## 0 3282 0
## 1 0 1536
##
## Accuracy : 1
## 95% CI : (0.9992, 1)
## No Information Rate : 0.6812
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6812
## Detection Rate : 0.6812
## Detection Prevalence : 0.6812
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
#Confusion Matrix on Test Set
mean(f_hat_rf2 == test_reviews$recommend)
## [1] 1
tab_rftest <- table(f_hat_rf2, test_reviews$recommend)
conf_rftest <- confusionMatrix(tab_rftest)
conf_rftest
## Confusion Matrix and Statistics
##
##
## f_hat_rf2 0 1
## 0 3282 0
## 1 0 1536
##
## Accuracy : 1
## 95% CI : (0.9992, 1)
## No Information Rate : 0.6812
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6812
## Detection Rate : 0.6812
## Detection Prevalence : 0.6812
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
This shows it’s much easier to identify a rating of 5 to use for highly recommended restaurants than it is to predict the exact star rating.
city_longlat <- matrix(c("Pittsburgh", 40.4406, -79.9959,
"Charlotte", 35.2271, -80.8431,
"Madison", 43.0731, -89.4012,
"Urbana-Champaign", 40.1106, -88.2073,
"Phoenix", 33.4484, -112.0740,
"Las Vegas", 36.1699, -115.1398,
"Montreal", 45.5017, -73.5673,
"Waterloo", 43.4643, -80.5204,
"Edinburgh", 55.9533, -3.1883,
"Karlsruhe", 49.0069, 8.4037),
ncol = 3, byrow = TRUE)
city_longlat <- data.frame(city_longlat)
colnames(city_longlat) <- c("yelp_city", "yc_latitude", "yc_longitude")
#change to numeric
city_longlat <- city_longlat %>% mutate(yc_longitude = as.numeric(levels(yc_longitude))[yc_longitude],
yc_latitude = as.numeric(levels(yc_latitude))[yc_latitude])
#join users that are local and tourists in at least one city each to the reviews and business datasets
locals_map <- yc_locals_trst %>%
left_join(reviews) %>%
inner_join(business_full, by = "business_id")
## Joining by: "user_id"
colnames(locals_map)[2] <- "user_city"
colnames(locals_map)[6] <- "review_stars"
colnames(locals_map)[19] <- "business_stars"
colnames(locals_map)[23] <- "business_city"
#remove business specific geographical information
locals_map1 <- locals_map %>% select(user_id, user_city, business_id, state, business_city, date)
#keep only US
locals_map_travel <- filter(locals_map1, user_city != business_city)
locals_map_travel <- locals_map_travel %>% left_join(city_longlat, by = c("user_city" = "yelp_city"))
locals_map_travel <- locals_map_travel[, c(1,2, 8, 7, 3:6)]
colnames(locals_map_travel)[3:4] <- c("uc_lon", "uc_lat")
locals_map_travel <- locals_map_travel %>% left_join(city_longlat, by = c("business_city" = "yelp_city"))
locals_map_travel <- locals_map_travel[, c(1:7, 10, 9, 8)]
colnames(locals_map_travel)[8:9] <- c("bc_lon", "bc_lat")
#gather information on how many yelp users traveled to what cities
locals_map_travel2 <- locals_map_travel %>%
group_by(user_city, business_city) %>%
mutate(num_travel = n()) %>%
select(user_city, uc_lon, uc_lat, business_city, bc_lon, bc_lat, state, num_travel) %>% unique
#color code US cities
us_city <- c("Pittsburgh","Charlotte","Madison","Urbana-Champaign","Phoenix","Las Vegas")
#population data in 100,000s by MSA in 2010/2014
us_city_pop <- c(23, 22, 8, 2, 20, 42)
us_city_color <- c("midnightblue", "purple4", "blue", "seagreen", "red", "brown")
us_city_color <- data.frame(cbind(us_city, us_city_pop, us_city_color))
us_city_color <- us_city_color %>% left_join(city_longlat, by = c("us_city"="yelp_city")) %>%
mutate(us_city_pop = as.numeric(as.character(us_city_pop)))
## Warning in left_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector
#join with travel data, get number of people traveling from a city divided by the city's population
us_travel <- locals_map_travel2 %>%
filter((user_city %in% us_city) & (business_city %in% us_city)) %>%
left_join(us_city_color, by=c("user_city" = "us_city")) %>%
mutate(pop_num = num_travel/us_city_pop)
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
Here we examine the number of yelpers in our dataset traveling within the US. Each map shows people traveling into a city from the other 5 cities in the US. The transparency of the lines connecting two cities represents the number of users traveling (lower transparency means fewer people).
dev.off()
## null device
## 1
for(i in 1:length(us_city)){
par(mar=c(1,1,1,1))
map <- map("state", col="mintcream", fill=TRUE, bg="white", lwd=0.05)
points(us_city_color$yc_longitude,
us_city_color$yc_latitude,
pch=16,cex=1.5,
col = as.character(us_city_color$us_city_color))
dest_city <- filter(us_travel, business_city == us_city[i])
plt <- colorRampPalette(c("#f2f2f2", as.character(us_city_color$us_city_color[i])))
colors <- plt(max(dest_city$pop_num))
for (j in 1:nrow(dest_city)) {
inter <- gcIntermediate(c(dest_city$uc_lon[j], dest_city$uc_lat[j]),
c(dest_city$bc_lon[j], dest_city$bc_lat[j]),
n=100, addStartEnd=TRUE)
c_index <- round(dest_city$pop_num[j])
lines(inter, col=colors[c_index], lwd=2)
}
text(us_city_color$yc_longitude,
us_city_color$yc_latitude,
us_city_color$us_city,
cex=0.5,adj=0,pos=1,col="black")
title(main = paste("Map of Yelp Users Traveling to", us_city[i]), cex.main = 1, line = 1)
}
There are a lot of people traveling from Phoenix to almost all of the other cities. Most of the other travels seem to be related to the physical closeness of the cities.
Now we look at all the cities in our dataset and try to examine whether there are areas in the city that are preferred by tourists versus locals.
all_map <- users_trstvsloc %>%
left_join(reviews) %>%
inner_join(business_full, by = "business_id")
## Joining by: "user_id"
colnames(all_map)[2] <- "user_city"
colnames(all_map)[5] <- "review_stars"
colnames(all_map)[18] <- "business_stars"
colnames(all_map)[22] <- "business_city"
all_map2 <- select(all_map, user_id, user_city, review_id:business_id, review_count, longitude,
latitude, state, business_stars, business_city)
rm(all_map)
#create indicator variables for tourists vs. locals
all_map2 <- all_map2 %>% mutate(tourist = (user_city == business_city)) %>%
mutate(longitude = as.numeric(longitude), latitude = as.numeric(latitude))
#get average ratings by tourists vs. locals
all_map3 <- all_map2 %>%
group_by(business_id, tourist) %>%
summarise(tourist_stars = mean(review_stars), tourist_review_count = n())
#create color coding and labels for the reviewer (locals vs. tourists)
all_map4 <- full_join(all_map2, all_map3) %>%
select(business_id, review_count, longitude, latitude, state,
business_stars, business_city, tourist, tourist_stars) %>%
unique %>%
mutate(cr = ifelse(tourist, "blue", "yellow"),
reviewer = ifelse(tourist, "tourist", "local")) %>% na.omit
## Joining by: c("business_id", "tourist")
world_city <- c("Pittsburgh","Charlotte", "Madison", "Urbana-Champaign", "Phoenix", "Las Vegas","Montreal",
"Waterloo", "Edinburgh","Karlsruhe")
#for each city map the locations reviewed by tourists vs locals, blue tourists, yellow locals
for(i in 1:length(world_city)){
dest_city <- all_map4 %>% filter(business_city == world_city[i])
new_map <-leaflet(data = dest_city) %>%
addTiles(urlTemplate = "http://{s}.tiles.wmflabs.org/bw-mapnik/{z}/{x}/{y}.png",
attribution = '© <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a>') %>%
addCircles(~longitude, ~latitude, color = ~cr, opacity = 1)
print(new_map)
}
We cannot detect specific areas of the cities tourists might prefer. For all locations, downtown areas seem popular with both groups of users.
We now visualize tourist preferred businesses in a different way. Here the businesses that are rated higher by tourists than their overall averages are shown in green and the businesses that are rated lower by tourists than their overall averages are shown in red.
dev.off()
## null device
## 1
business_tourist_effect <- read_csv("business_w_tourist_effect.csv")
all_map8 <- left_join(business_tourist_effect,
unique(select(all_map2, business_id, longitude, latitude, business_city)))
## Joining by: "business_id"
all_map8 <- all_map8 %>% mutate(cr = ifelse(tourist_i <= 0, "red", "green"))
for(i in 1:length(world_city)){
dest_city <- all_map8 %>% filter(business_city == world_city[i])
new_map <- leaflet(data = dest_city) %>%
addTiles(urlTemplate = "http://{s}.tiles.wmflabs.org/bw-mapnik/{z}/{x}/{y}.png",
attribution = '© <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a>') %>%
addCircles(~longitude, ~latitude, color = ~cr, opacity = 1)
print(new_map)
}
Again, we cannot detect specific areas of the cities tourists might prefer.
We now examine the ratings by tourists vs. locals in top 25 food categories for each city.
#filter only food businesses
business_food <- business_full %>%
filter(grepl("Restaurants", as.character(categories)) |
grepl("Cafe", as.character(categories)))
#create dataset with indicator variables for each food category as before
business_food_cat <- cbind(unlist(rep(business_food$business_id,lapply(business_food$categories,length))),
unlist(business_food$categories))
business_food_cat <- data.frame(business_food_cat)
colnames(business_food_cat) <- c("business_id", "category")
all_map5 <- inner_join(all_map4, business_food_cat)
## Joining by: "business_id"
## Warning in inner_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
#plot
for(i in 1:length(world_city)){
dest_city <- filter(all_map5, business_city == world_city[i])
cats <- dest_city %>%
group_by(category) %>%
summarise(n = n()) %>%
arrange(desc(n))
print(dest_city %>%
filter(category %in% cats$category[1:25]) %>%
group_by(category, tourist) %>%
summarise(stars = mean(tourist_stars), se = 1.96*sd(tourist_stars)/sqrt(n())) %>%
ggplot(aes(x = category, y = stars, group = tourist)) +
geom_point(aes(color = tourist)) +
geom_errorbar(aes(ymax = stars + se, ymin=stars - se), width=0.2) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Food Category", y = "Average Stars",
title = paste("Locals vs. Tourist ratings in", world_city[i], "in Top 25 Categories")))
}
## Warning: Removed 1 rows containing missing values (geom_errorbar).
There are some interesting patterns in how different categories are rated across the categories but there’s not a lot of difference since the confidence intervals overlap for most data points.
We now examine tourist vs locals preference at a finer level by plotting average tourist vs local rating for each business.
all_map6 <- all_map4 %>% filter(business_id %in% unique(business_food$business_id))
all_map6 <- all_map6 %>% group_by(business_id) %>% filter(n() == 2)
all_map6 <- all_map6 %>% mutate(tourist = ifelse(tourist, 1, 0))
all_map6 <- all_map6 %>% select(business_id, business_city, tourist, tourist_stars)
all_map7 <- all_map6 %>% spread(tourist, tourist_stars, convert = TRUE)
colnames(all_map7)[3] <- "avg_local_star"
colnames(all_map7)[4] <- "avg_trst_star"
for(i in 1:length(world_city)){
dest_city <- filter(all_map7, business_city == world_city[i])
print(dest_city %>%
ggplot(aes(x = avg_local_star, y=avg_trst_star)) +
geom_point(colour = "seagreen", alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color="red", linetype="dashed", size=0.5) +
labs(x = "Average Local Star", y = "Average Tourist Star",
title = paste("Locals vs. Tourist ratings by business in", world_city[i])))
}
Most of the data points seem to be correlated and businesses rated higher by locals are also rated higher by tourists. For the businesses that are rated low by locals, tourists seem to be more generous in general. This applies less to businesses rated low by tourists.